# Read heart_2020_cleaned.csv (relative to the working directory) into the
# `heart` data frame.
heart <- read.csv(file = "heart_2020_cleaned.csv")
Uno de los factores que más claramente ha demostrado estar correlacionado con las enfermedades cardíacas es la edad. Pero ¿afecta en igual medida tanto a hombres como a mujeres? En este gráfico vamos a tratar de responder a preguntas como: ¿qué probabilidad hay de contraer enfermedades cardíacas siendo un varón de 40 años? o ¿en qué grupos de edad las mujeres tienen más probabilidad de contraer enfermedades cardíacas que los hombres?
library(plotly)
library(dplyr)
# Grouped bar chart: for every age category, the percentage of women and the
# percentage of men (each computed within its own sex) with
# HeartDisease == "Yes".
heart %>%
  group_by(AgeCategory) %>%
  summarise(
    # Within-sex rates: diseased women / all women, diseased men / all men.
    prob.disease.female = sum(Sex == "Female" & HeartDisease == "Yes") /
      sum(Sex == "Female") * 100,
    prob.disease.male = sum(Sex == "Male" & HeartDisease == "Yes") /
      sum(Sex == "Male") * 100
  ) %>%
  plot_ly(
    x = ~AgeCategory,
    y = ~prob.disease.female,
    name = "Mujeres",
    type = "bar",
    marker = list(color = 'rgba(237, 106, 90, 1)')
  ) %>%
  # The second trace reuses x and type from plot_ly().
  add_trace(
    y = ~prob.disease.male,
    name = 'Hombres',
    marker = list(color = 'rgba(56, 145, 166, 1)')
  ) %>%
  layout(
    barmode = 'group',
    title = "Probabilidad de contraer enfermedades cardiacas clasificados por grupo de edad y género",
    xaxis = list(title = "Edad"),
    yaxis = list(title = "Porcentaje"),
    paper_bgcolor = 'rgb(248, 248, 255)',
    plot_bgcolor = 'rgb(248, 248, 255)',
    # layout.margin must be a named list; an unnamed numeric vector is not a
    # valid margin specification.
    margin = list(l = 50, r = 50, b = 50, t = 50, pad = 4)
  )
Otro enfoque demográfico interesante puede ser cómo afectan no solo el género y la edad, sino también la raza, a las enfermedades cardíacas, pudiendo responder a preguntas tales como: ¿qué porcentaje de las personas asiáticas que han sido diagnosticadas con enfermedades cardíacas son mujeres?
library(plotly)
library(dplyr)
# Horizontal stacked bar per race, split into four segments: women with /
# without heart disease and men with / without heart disease. Every segment is
# a percentage of all respondents of that race, rounded to one decimal.
heart %>%
  group_by(Race) %>%
  summarise(
    # n() is the number of rows in the current race group (the common
    # denominator of the four segments).
    prob.disease.female = round(
      sum(Sex == "Female" & HeartDisease == "Yes") / n() * 100, 1
    ),
    prob.female = round(
      sum(Sex == "Female" & HeartDisease == "No") / n() * 100, 1
    ),
    prob.disease.male = round(
      sum(Sex == "Male" & HeartDisease == "Yes") / n() * 100, 1
    ),
    prob.male = round(
      sum(Sex == "Male" & HeartDisease == "No") / n() * 100, 1
    )
  ) %>%
  plot_ly(
    y = ~Race,
    x = ~prob.disease.female,
    name = "Mujeres con problemas cardíacos",
    type = "bar",
    orientation = 'h',
    hovertemplate = '%{x}%',
    marker = list(color = 'rgba(237, 106, 90, 1)')
  ) %>%
  # Remaining segments; lighter colours mark the "without disease" groups.
  add_trace(
    x = ~prob.female,
    name = "Mujeres sin problemas cardíacos",
    marker = list(color = 'rgba(237, 106, 90, 0.5)')
  ) %>%
  add_trace(
    x = ~prob.disease.male,
    name = "Hombres con problemas cardíacos",
    marker = list(color = 'rgba(56, 145, 166, 1)')
  ) %>%
  add_trace(
    x = ~prob.male,
    name = "Hombres sin problemas cardíacos",
    marker = list(color = 'rgba(56, 145, 166, 0.5)')
  ) %>%
  layout(
    barmode = 'stack',
    title = "Enfermedades cardiacas clasificados por raza y género",
    paper_bgcolor = 'rgb(248, 248, 255)',
    plot_bgcolor = 'rgb(248, 248, 255)',
    showlegend = FALSE,
    # layout.margin must be a named list; an unnamed numeric vector is not a
    # valid margin specification.
    margin = list(l = 50, r = 50, b = 50, t = 50, pad = 4),
    xaxis = list(
      title = "",
      showgrid = FALSE,
      showline = FALSE,
      showticklabels = FALSE,
      zeroline = FALSE
    ),
    yaxis = list(
      title = "",
      showgrid = FALSE,
      showline = FALSE,
      showticklabels = TRUE,
      ticks = "outside",
      ticklen = 10,
      zeroline = FALSE
    )
  )
Uno de los factores claros que provocan enfermedades cardíacas son los malos hábitos, pero ¿cuál es el más predominante entre las personas diagnosticadas con enfermedades cardíacas?
library(dplyr)
library(plotly)
# Pie chart of unhealthy habits among respondents with HeartDisease == "Yes".
# `x` is a one-row data frame with the percentage of smokers, alcohol drinkers
# and physically inactive people, plus a remainder slice ("Otros").
x <- heart %>%
  filter(HeartDisease == "Yes") %>%
  summarise(
    smoking = sum(Smoking == "Yes") / n() * 100,
    alcohol = sum(AlcoholDrinking == "Yes") / n() * 100,
    no.activity = sum(PhysicalActivity == "No") / n() * 100,
    # The three habits are computed independently and are not mutually
    # exclusive, so their sum can exceed 100; clamp the remainder at 0 so the
    # pie never receives a negative value.
    # NOTE(review): because the categories overlap, the slices are not a true
    # partition of the diagnosed population.
    other = max(0, 100 - smoking - alcohol - no.activity)
  )

plot_ly(
  type = 'pie',
  # Labels follow the column order of `x`.
  labels = c("Fumador", "Consumidor de alcohol", "Persona Pasiva", "Otros"),
  values = as.numeric(unlist(x)),
  textinfo = 'label',
  hoverinfo = 'percent',
  insidetextorientation = 'radial',
  insidetextfont = list(color = c('#FFFFFF', '#000000', '#000000', '#000000')),
  marker = list(colors = c('#ED6A5A', '#9BC1BC', '#F4F1BB', '#E6EBE0'))
) %>%
  layout(
    title = 'Malos hábitos en personas diagnosticadas con enfermedades cardíacas',
    xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
    yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
    # layout.margin must be a named list; an unnamed numeric vector is not a
    # valid margin specification.
    margin = list(l = 50, r = 50, b = 50, t = 50, pad = 4),
    paper_bgcolor = 'rgba(248, 248, 255, 1)',
    # Four-component colours need the rgba() form (was written as rgb()).
    plot_bgcolor = 'rgba(248, 248, 255, 1)'
  )
Otro factor determinante es el índice de masa corporal, pero ¿afecta este de igual forma a mujeres y a hombres?
library(dplyr)
library(plotly)
# Horizontal grouped bar chart: for every BMI category, the percentage of
# respondents in that category who are women / men with HeartDisease == "Yes".
heart %>%
  mutate(
    # Bin BMI into four classes. The result is a factor with explicit levels
    # so the category axis follows the ordinal BMI order instead of the
    # alphabetical order of the labels.
    BMI.cat = factor(
      case_when(
        BMI < 18.5 ~ "Bajo peso",
        BMI >= 18.5 & BMI < 25 ~ "Normal",
        BMI >= 25 & BMI < 30 ~ "Sobrepeso",
        BMI >= 30 ~ "Obesidad"
      ),
      levels = c("Bajo peso", "Normal", "Sobrepeso", "Obesidad")
    )
  ) %>%
  group_by(BMI.cat) %>%
  summarise(
    # NOTE(review): the denominator is the whole BMI category (both sexes),
    # so these are shares of the category, not within-sex disease rates;
    # confirm this is the intended comparison.
    prob.disease.female = round(
      sum(Sex == "Female" & HeartDisease == "Yes") / n() * 100, 1
    ),
    prob.disease.male = round(
      sum(Sex == "Male" & HeartDisease == "Yes") / n() * 100, 1
    )
  ) %>%
  plot_ly(
    y = ~BMI.cat,
    x = ~prob.disease.female,
    name = "Mujeres",
    type = "bar",
    orientation = 'h',
    hovertemplate = '%{x}%',
    marker = list(color = 'rgba(237, 106, 90, 1)')
  ) %>%
  add_trace(
    x = ~prob.disease.male,
    name = "Hombres",
    marker = list(color = 'rgba(56, 145, 166, 1)')
  ) %>%
  layout(
    barmode = 'group',
    title = "Enfermedades cardiacas clasificados por BMI y género",
    paper_bgcolor = 'rgb(248, 248, 255)',
    plot_bgcolor = 'rgb(248, 248, 255)',
    # layout.margin must be a named list; an unnamed numeric vector is not a
    # valid margin specification.
    margin = list(l = 50, r = 50, b = 50, t = 50, pad = 4),
    showlegend = TRUE,
    xaxis = list(
      title = "",
      showgrid = FALSE,
      showline = FALSE,
      showticklabels = FALSE,
      zeroline = FALSE
    ),
    yaxis = list(
      title = "",
      showgrid = FALSE,
      showline = FALSE,
      showticklabels = TRUE,
      ticks = "outside",
      ticklen = 10,
      zeroline = FALSE
    )
  )
Por último, se pretende observar cómo afectan las diferentes variables a las enfermedades cardíacas de forma general, haciendo uso del algoritmo Random Forest. Para ello, se han generado dos modelos, uno para mujeres y otro para hombres, y se ha representado la importancia de cada una de las variables.
library(randomForest)
library(plotly)
library(dplyr)
# Fit one random forest per sex predicting HeartDisease from all other
# variables, then plot each variable's importance for both models as markers.
heart2 <- heart %>%
  mutate(
    # Discretise the numeric health indicators into labelled bands.
    BMI = case_when(
      BMI < 18.5 ~ "Bajo peso",
      BMI >= 18.5 & BMI < 25 ~ "Normal",
      BMI >= 25 & BMI < 30 ~ "Sobrepeso",
      BMI >= 30 ~ "Obesidad"
    ),
    SleepTime = case_when(
      SleepTime < 7 ~ "Insuficiente",
      SleepTime >= 7 & SleepTime < 8 ~ "Recomendable",
      SleepTime >= 8 ~ "Suficiente"
    ),
    MentalHealth = case_when(
      MentalHealth < 5 ~ "Buena",
      MentalHealth >= 5 & MentalHealth < 10 ~ "Mejorable",
      MentalHealth >= 10 ~ "Mala"
    ),
    PhysicalHealth = case_when(
      PhysicalHealth < 5 ~ "Buena",
      PhysicalHealth >= 5 & PhysicalHealth < 10 ~ "Mejorable",
      PhysicalHealth >= 10 ~ "Mala"
    ),
    # A factor response makes randomForest fit a classification model.
    HeartDisease = as.factor(HeartDisease)
  )

# case_when() yields character columns (and read.csv may too); convert every
# character column to a factor so the predictors enter the model as
# categorical variables rather than as plain strings.
heart2[] <- lapply(
  heart2,
  function(column) if (is.character(column)) factor(column) else column
)

# Sex is constant inside each per-sex subset, so it is excluded from the
# predictors. It is dropped by name instead of by column position so the code
# does not depend on the column order of the CSV.
predictor.cols <- names(heart2) != "Sex"

set.seed(73)
# NOTE(review): ntree = 1 grows a single tree per model, so the importances
# below come from one tree only — presumably chosen for speed; consider a
# larger forest for stable results.
rf.women <- randomForest(
  HeartDisease ~ .,
  ntree = 1,
  data = heart2[heart2$Sex == "Female", predictor.cols]
)
rf.men <- randomForest(
  HeartDisease ~ .,
  ntree = 1,
  data = heart2[heart2$Sex == "Male", predictor.cols]
)

plot_ly(
  x = as.numeric(rf.women$importance),
  y = rownames(rf.women$importance),
  name = "Women",
  type = 'scatter',
  mode = "markers",
  marker = list(color = 'rgba(237, 106, 90, 1)')
) %>%
  add_trace(
    x = as.numeric(rf.men$importance),
    y = rownames(rf.men$importance),
    name = "Men",
    type = 'scatter',
    mode = "markers",
    marker = list(color = 'rgba(56, 145, 166, 1)')
  ) %>%
  layout(
    title = "Importancia de cada variable sobre HeartDisease por Random Forest",
    paper_bgcolor = 'rgb(248, 248, 255)',
    plot_bgcolor = 'rgb(248, 248, 255)',
    # layout.margin must be a named list; an unnamed numeric vector is not a
    # valid margin specification.
    margin = list(l = 50, r = 50, b = 50, t = 50, pad = 4),
    # xaxis used to be passed twice with conflicting settings (a title in one,
    # an empty title and hidden tick labels in the other). Merged into a
    # single definition that keeps the axis title and its tick labels.
    xaxis = list(
      title = "MeanDecreaseGini",
      showgrid = FALSE,
      showline = FALSE,
      showticklabels = TRUE,
      zeroline = FALSE
    ),
    yaxis = list(
      title = "",
      showgrid = TRUE,
      showline = FALSE,
      showticklabels = TRUE,
      zeroline = FALSE
    )
  )